1 Preparatory steps for the Markdown file

Setting the options for knitr

library(knitr)
knitr::opts_chunk$set(results = 'asis',
                       echo = TRUE,
                       comment = NA,
                       prompt  = FALSE,
                       cache   = FALSE,
                       warning = FALSE,
                       message = FALSE,
                       fig.align = "center",
                       fig.width = 8.125,
                       out.width = "100%",
                       fig.path = "Figures/figures_umap/",
                       dev = c('png', 'tiff'),
                       dev.args = list(png = list(type = "cairo"), tiff = list(compression = 'lzw')),
                       cache = TRUE,
                       cache.path = "D:/cache/cache_umap/",
                       autodep = TRUE)
options(width = 1000, scipen = 999, knitr.kable.NA = '')

Loading libraries…

library(tidyverse)
library(gridExtra)
library(kableExtra)
library(uwot) # To compute UMAP
#require(devtools)
#install_version("clues", version = "0.6.2.2", repos = "http://cran.us.r-project.org")
library(clues) # For silhouettes and silhouette scores
library(parallel)

Cleaning the memory…

rm(list = ls(all.names = TRUE))

2 Loading R objects previously defined

Loading data tables and definitions of features sets prepared during the curation phase…

load(file = "Study_objects.RData")

3 Building tools to compute and display UMAP & silhouettes

3.2 Home-made functions to generate UMAP and silhouettes

We define the function get_umap_and_silhouette() which takes as input a data frame (or tibble) which for each predicted observation contains i) either the values of variables describing the observation or the probabilities of each possible output (individual or call type) as returned by a classifier for the observation, ii) the individual, iii) the call type and iv) the call id… The one_hot parameter of the function specifies whether we ‘one-hot encode’ the probabilities returned by a classifier (meaning that the highest probability gets the value 1 and all the others the value 0). This only makes sense when one provides such probabilities, and not raw values for variables describing the calls.

get_umap_and_silhouette <- function(df_umap, target_DV, target_algo, target_set, one_hot = F, n_neighbors = 15, metric = "euclidean", min_dist = 0.01, size = 0.5, alpha = 0.5) {

  set.seed(123) # To ensure reproducibility
  
  df_umap <- df_umap %>%
    filter(algo == target_algo, set == target_set) %>%
    dplyr::select(-algo, -set)

  # We assume that we only consider individual vocal signature and call type
  if (target_DV == "individual")
    other_V <- "type"
  
  if (target_DV == "type")
    other_V <- "individual"
  
  # We create a matrix which contains the numerical values we need for the umap
  m <- umap_coords <- df_umap %>%
    dplyr::select(-id, -individual, -type) %>%
    as.matrix()
  
  # If we 'one-hot' encode the values, we transform the matrix row-wise
  if (one_hot)
    m <- apply(m, 1, function(v) { as.integer(v == max(v)) }) %>% t()

  # We compute a UMAP and turn the results (X and Y coordinates) into a tibble with additional information about individual and call type
  umap_coords <- m %>%
    umap(n_neighbors, n_components = 2, metric = metric, min_dist = min_dist, n_threads = detectCores()-2) %>%
    data.frame() %>%
    as_tibble() %>%
    mutate(individual = df_umap %>% pull(individual),
           type = df_umap %>% pull(type)) 
  
  # We create a ggplot figure to display the UMAP
  p_umap <- umap_coords %>% ggplot(aes(x = X1, y = X2, col = .data[[target_DV]])) +
    geom_point(size = size, alpha = alpha) +
    scale_color_brewer(palette = "Paired") +
    guides(colour = guide_legend(override.aes = list(size = 2, alpha = 1.0))) +
    ggtitle(paste0("UMAP - ", target_algo, "\n", target_set))
  
  # We compute the silhouette for the output of the UMAP
  silhouette <- get_Silhouette(as.matrix(umap_coords[, 1:2]), umap_coords[[target_DV]], disMethod = "Euclidean")
  silhouette.df <- data.frame(sil = silhouette$s, observed = umap_coords[[target_DV]])
  silhouette.df <- silhouette.df[order(silhouette.df$observed, -silhouette.df$sil), ]
  silhouette.df$name <- factor(rownames(silhouette.df), levels = rownames(silhouette.df))
  
  # We compute mean silhouette scores per group, i.e., per level of the target DV
  means.per.group <- silhouette.df %>%
    mutate(name = as.integer(name)) %>%
    group_by(observed) %>%
    summarise(mean.group = round(mean(sil), 2), mean.name = mean(name), min = name[which.min(name)], max = name[which.max(name)])
  
  # We create a ggplot figure to display the silhouette
  sil_plot <- silhouette.df %>%
    ggplot(., aes(x = name, y = sil, color = observed, fill = observed)) +
    geom_col() +
    labs(y = "Silhouette value",
       x = "",
       fill="Cluster",
       color="Cluster",
       title = paste0("Silhouette plot - ", target_algo, "\n", target_set),
       subtitle = paste0("Average silhouette value: ", round(mean(silhouette.df$sil), 2)) ) +
    scale_color_brewer(palette = "Paired") +
    scale_fill_brewer(palette = "Paired") +
    ggplot2::ylim(c(-1, 1)) +
    #geom_hline(yintercept = mean(silhouette.df$sil), linetype = "dashed", color = "red", size=0.75) +
    geom_text(data=means.per.group, aes(x=mean.name, y=mean.group, label= mean.group), color="black", hjust = -0.2) +
    geom_segment(data = means.per.group, aes(x = min - 0.5, xend = max + 0.5, y = mean.group, yend = mean.group), size=0.75, linetype = "dashed", color="black") +
    theme(axis.text.y = element_blank(), axis.ticks.y = element_blank()) +
    coord_flip() +
    guides(fill="none", color="none")
  
  # We return the whole silhouette as well as the two ggplot figures
  return (list(umap = p_umap, silhouette = silhouette, silhouette_plot = sil_plot))
}

4 Loading data for the UMAP & silhouettes

We first define a function build_probability_data() to build the data tables we need to create UMAP and silhouettes. Given a filename, this function loads an .RData file which contains all the outputs generated previously with various classifiers and feature sets. It extracts the prediction probabilities of the classifiers and then builds and returns a data table with the required information…

build_probability_data <- function(filename) {
  
  load(filename)
  
  probabilities <- all_outputs %>% purrr::map_dfr("prediction_probabilities")
  
  sets <- all_outputs %>% purrr::map("sets")
  test_sets <- sets %>% purrr::map("testing_ids")

  nb_per_repetition <- probabilities %>% filter(rep == 1) %>% nrow()
  
  reduced_df <- df %>% select(id, individual, type)

  all_ids <- c()
  for (i in 1:length(test_sets)) {
    ids <- test_sets[[i]] %>% as.character()
    nb_rep <- nb_per_repetition / length(ids)
    all_ids <- c(all_ids, rep(ids, nb_rep))
  }
  
  probabilities <- probabilities %>%
    mutate(id = all_ids) %>%
    select(-true_value, -DV, -rep) %>%
    mutate(id = as.factor(id), algo = as.factor(algo), set = as.factor(set)) %>%
    inner_join(reduced_df, by = c("id" = "id"))

  return (probabilities)
}

We then call the previous function twice to prepare data tables for individual signatures and call types, as they were classified with SVM, NN, xgboost and ensemble learners. We also load a file which contains the appropriate data for the pDFA with both individual signatures and call types. first load the results of the assessments of the different models, and create different data tables…

probabilities_individual <- build_probability_data("Result files/results - individual - default.RData")
probabilities_type <- build_probability_data("Result files/results - type - default.RData")
pdfa_data <- readRDS("Result files/pdfa-permuted-probabilities.rds")

5 Investigating UMAP & silhouettes for different configurations

In the following sub-sections, we repeat the same approach for different configurations: we consider first individual signatures then call types, and, for each of these target variables, get UMAP and silhouettes for i) the raw descriptions of the calls with the features from the three sets ‘Bioacoustic’, ‘DCT’ and ‘MFCC’, ii) the predictions of the pDFA with the ‘Bioacoustic’ feature set, iii) the predictions of the best stacked learner. Once all figures have been created, we assemble them in a single figure used for the publication.

5.1 UMAP & silhouettes for individuals

5.1.1 Initial features:

Computing the UMAP and silhouette…

target_columns <- c("id", "individual", "type", Bioacoustic_set, DCT_set, MFCC_set)

df_umap <- df %>%
  dplyr::select(one_of(target_columns)) %>%
  mutate(algo = "Raw features", set = "Bioacoustic, DCT, MFCC")

initial_data_ind <- df_umap %>% get_umap_and_silhouette("individual", "Raw features", "Bioacoustic, DCT, MFCC", size = 1)

Displaying the UMAP…

initial_data_ind$umap

Displaying the silhouette…

initial_data_ind$silhouette_plot

5.1.2 pDFA, Bioacoustic set

Computing the UMAP and silhouette…

df_umap <-
  pdfa_data$individual$Bioacoustic %>%
  as_tibble() %>%
  rename(individual = "observed", type = "control", id = "rowname") %>%
  mutate(algo = "pDFA", set = "Bioacoustic")

pdFA_ind <- df_umap %>% get_umap_and_silhouette("individual", "pDFA", "Bioacoustic", one_hot = T)

Displaying the UMAP…

pdFA_ind$umap

Displaying the silhouette…

pdFA_ind$silhouette_plot

5.1.3 Best ensemble

Computing the UMAP and silhouette…

best_ensemble_ind <- probabilities_individual %>% get_umap_and_silhouette("individual", "ensemble", "3 classifiers x 3 sets", one_hot = T)

Displaying the UMAP…

best_ensemble_ind$umap

Displaying the silhouette…

best_ensemble_ind$silhouette_plot

5.1.4 Displaying a synthetic figure of the results

grid.arrange(
  initial_data_ind$umap, pdFA_ind$umap, best_ensemble_ind$umap,
  initial_data_ind$silhouette_plot, pdFA_ind$silhouette_plot, best_ensemble_ind$silhouette_plot,
  nrow = 2, ncol = 3,
  heights = c(0.5, 1)
)

5.2 UMAP & silhouettes for type

5.2.1 Initial features:

Computing the UMAP and silhouette…

target_columns <- c("id", "individual", "type", Bioacoustic_set, DCT_set, MFCC_set)
df_umap <- df %>%
  select(one_of(target_columns)) %>%
  mutate(algo = "Raw features", set = "Bioacoustic, DCT, MFCC")

initial_data_type <- df_umap %>% get_umap_and_silhouette("type", "Raw features", "Bioacoustic, DCT, MFCC", size = 1)

Displaying the UMAP…

initial_data_type$umap

Displaying the silhouette…

initial_data_type$silhouette_plot

5.2.2 pDFA, Bioacoustic:

Computing the UMAP and silhouette…

df_umap <- pdfa_data$type$Bioacoustic %>%
  as_tibble() %>%
  rename(individual = "control", type = "observed", id = "rowname") %>%
  mutate(algo = "pDFA", set = "Bioacoustic")

pdFA_type <- df_umap %>% get_umap_and_silhouette("type", "pDFA", "Bioacoustic", one_hot = T)

Displaying the UMAP…

pdFA_type$umap

Displaying the silhouette…

pdFA_type$silhouette_plot

5.2.3 Best ensemble:

Computing the UMAP and silhouette…

best_ensemble_type <- probabilities_type %>% get_umap_and_silhouette("type", "ensemble", "3 classifiers x 3 sets", one_hot = T)

Displaying the UMAP…

best_ensemble_type$umap

Displaying the silhouette…

best_ensemble_type$silhouette_plot

5.2.4 Displaying a synthetic figure of the results

grid.arrange(
  initial_data_type$umap, pdFA_type$umap, best_ensemble_type$umap,
  initial_data_type$silhouette_plot, pdFA_type$silhouette_plot, best_ensemble_type$silhouette_plot,
  nrow = 2, ncol = 3,
  heights = c(0.5, 1)
)

6 Environment

cat(utils::sessionInfo()$R.version$version.string, "<br />", sep = "")

R version 4.1.3 (2022-03-10)

cat("Platform: ", utils::sessionInfo()$platform, "<br />", sep = "")

Platform: x86_64-w64-mingw32/x64 (64-bit)

cat("OS version: ", utils::sessionInfo()$running, "<br />", sep = "")

OS version: Windows 10 x64 (build 22000)

7 Packages

packages.base <- utils::sessionInfo()$basePkgs
packages.others <- names(utils::sessionInfo()$otherPkgs)

packages <- c(packages.base, packages.others)

for(i in 1:length(packages)) {
  reference <- cat("`",
                  packages[i],
                  "`: ",
                  "*",
                  packageDescription(packages[i])$Title,
                  "* version ",
                  packageDescription(packages[i])$Version,
                  "<br />",
                  sep = "")
  reference
}

parallel: Support for Parallel computation in R version 4.1.3
stats: The R Stats Package version 4.1.3
graphics: The R Graphics Package version 4.1.3
grDevices: The R Graphics Devices and Support for Colours and Fonts version 4.1.3
utils: The R Utils Package version 4.1.3
datasets: The R Datasets Package version 4.1.3
methods: Formal Methods and Classes version 4.1.3
base: The R Base Package version 4.1.3
clues: Clustering Method Based on Local version 0.6.2.2
uwot: The Uniform Manifold Approximation and Projection (UMAP) Method for Dimensionality Reduction version 0.1.11
Matrix: Sparse and Dense Matrix Classes and Methods version 1.4-1
gridExtra: Miscellaneous Functions for “Grid” Graphics version 2.3
forcats: Tools for Working with Categorical Variables (Factors) version 0.5.1
stringr: Simple, Consistent Wrappers for Common String Operations version 1.4.0
purrr: Functional Programming Tools version 0.3.4
readr: Read Rectangular Text Data version 2.1.2
tidyr: Tidy Messy Data version 1.2.0
tibble: Simple Data Frames version 3.1.6
ggplot2: Create Elegant Data Visualisations Using the Grammar of Graphics version 3.3.5
tidyverse: Easily Install and Load the ‘Tidyverse’ version 1.3.1
dplyr: A Grammar of Data Manipulation version 1.0.8
knitr: A General-Purpose Package for Dynamic Report Generation in R version 1.37


  1. Département des arts, des lettres et du langage, Université du Québec à Chicoutimi, Chicoutimi, Canada↩︎

  2. Université de Lyon, CNRS, Laboratoire Dynamique Du Langage, UMR 5596, Lyon, France↩︎

  3. ENES Bioacoustics Research Laboratory, University of Saint Étienne, CRNL, CNRS UMR 5292, INSERM UMR_S 1028, Saint-Étienne, France↩︎

  4. Département des arts, des lettres et du langage, Université du Québec à Chicoutimi, Chicoutimi, Canada↩︎

  5. ENES Bioacoustics Research Laboratory, University of Saint Étienne, CRNL, CNRS UMR 5292, INSERM UMR_S 1028, Saint-Étienne, France↩︎

  6. ENES Bioacoustics Research Laboratory, University of Saint Étienne, CRNL, CNRS UMR 5292, INSERM UMR_S 1028, Saint-Étienne, France↩︎

  7. Department of Linguistics, The University of Hong Kong, Hong Kong, China, Corresponding author↩︎